# -*- coding: utf-8 -*-
import scrapy
import re
from copy import deepcopy


class BilianSpider(scrapy.Spider):
    name = 'bilian'
    allowed_domains = ['ebnew.com', 'ss.ebnew.com']

    keyword_s = [
        '路由器', '变压器'
    ]

    # 存储数据的格式
    sql_data = dict(
        projectcode='',  # 项目编号
        web='',  # 信息来源网站
        keyword='',  # 关键字
        detail_url='',  # 招标详细页网址
        title='',  # 第三方网站发布标题
        toptype='',  # 信息类型
        province='',  # 归属省份
        product='',  # 产品范畴
        industry='',  # 归属行业
        tendering_manner='',  # 招标方式
        publicity_date='',  # 招标公示日期
        expiry_date='',  # 招标截止时间
    )
    # Form表单的数据格式
    form_data = dict(
        infoClassCodes='',
        rangeType='',
        projectType='bid',
        fundSourceCodes='',
        dateType='',
        startDateCode='',
        endDateCode='',
        normIndustry='',
        normIndustryName='',
        zone='',
        zoneName='',
        zoneText='',
        key='',  # 搜索的关键字
        pubDateType='',
        pubDateBegin='',
        pubDateEnd='',
        sortMethod='timeDesc',
        orgName='',
        currentPage='',  # 当前页码
    )

    def start_requests(self):
        for keyword in self.keyword_s:
            form_data = deepcopy(self.form_data)
            form_data['key'] = keyword
            form_data['currentPage'] = '1'
            request = scrapy.FormRequest(
                url='http://ss.ebnew.com/tradingSearch/index.htm',
                formdata=form_data,
                callback=self.parse_start,
            )
            request.meta['form_data'] = form_data
            yield request

        # yield scrapy.Request(
        #     url='http://www.ebnew.com/businessShow/622918966.html',
        #     callback=self.parse_page2,
        # )

        # form_data = self.form_data
        # form_data['key'] = '路由器'
        # form_data['currentPage'] = '2'
        #
        # yield scrapy.FormRequest(
        #     url='http://ss.ebnew.com/tradingSearch/index.htm',
        #     formdata=form_data,
        #     callback=self.parse_page1,
        # )

    def parse_start(self, response):
        a_text_s = response.xpath('//form[@id="pagerSubmitForm"]/a/text()').extract()
        page_max = max(
            [int(a_text) for a_text in a_text_s if re.match('\d+', a_text)]
        )
        # page_max = 2
        self.parse_page1(response)
        for page in range(2, page_max + 1):
            form_data = deepcopy(response.meta['form_data'])
            form_data['currentPage'] = str(page)
            request = scrapy.FormRequest(
                url='http://ss.ebnew.com/tradingSearch/index.htm',
                formdata=form_data,
                callback=self.parse_page1,
            )
            request.meta['form_data'] = form_data
            yield request

    def parse_page1(self, response):
        form_data = response.meta['form_data']
        keyword = form_data.get('key')
        content_list_x_s = response.xpath('//div[@class="ebnew-content-list"]/div')
        for content_list_x in content_list_x_s:
            sql_data = deepcopy(self.sql_data)
            sql_data['toptype'] = content_list_x.xpath('./div[1]/i[1]/text()').extract_first()
            sql_data['title'] = content_list_x.xpath('./div[1]/a/text()').extract_first()
            sql_data['publicity_date'] = content_list_x.xpath('./div[1]/i[2]/text()').extract_first()
            if sql_data['publicity_date']:
                # 匹配纯数字
                sql_data['publicity_date'] = re.sub('[^0-9\-]', '', sql_data['publicity_date'])

            sql_data['tendering_manner'] = content_list_x.xpath('./div[2]/div[1]/p[1]/span[2]/text()').extract_first()
            sql_data['product'] = content_list_x.xpath('./div[2]/div[1]/p[2]/span[2]/text()').extract_first()
            sql_data['expiry_date'] = content_list_x.xpath('./div[2]/div[2]/p[1]/span[2]/text()').extract_first()
            sql_data['province'] = content_list_x.xpath('./div[2]/div[2]/p[2]/span[2]/text()').extract_first()

            sql_data['detail_url'] = content_list_x.xpath('./div[1]/a/@href').extract_first()
            sql_data['keyword'] = keyword
            sql_data['web'] = '必联网'

            request = scrapy.Request(
                url=sql_data['detail_url'],
                callback=self.parse_page2,
            )
            request.meta['sql_data'] = sql_data
            # print(sql_data)
            yield request

    def parse_page2(self, response):
        sql_data = response.meta['sql_data']

        sql_data['projectcode'] = response.xpath('//ul[@class="ebnew-project-information"]/li[1]/span[2]/text()').extract_first()
        sql_data['industry'] = response.xpath('//ul[@class="ebnew-project-information"]/li[8]/span[2]/text()').extract_first()

        if not sql_data['projectcode']:
            projectcode_find = re.findall('项目编号[:：]{0,1}\s{0,2}([a-zA-Z0-9\-_]{10,80})', response.body.decode('utf-8'))
            sql_data['projectcode'] = projectcode_find[0] if projectcode_find else ""

        # print('-----------', sql_data[])
        yield sql_data

